---
title:  "Replication Material: What's in a Buzzword? - A Systematic Review of the State of Populism Research in Political Science"
author: "Sophia Hunger and Fred Paxton"
date:  "`r format(Sys.time(), '%d %B, %Y')`"
always_allow_html: yes
output: pdf_document
geometry: margin = 1in
fontsize: 12pt
tables: true
classoption: a4paper
header-includes:
        - \usepackage{booktabs}
        - \usepackage{longtable}
        - \usepackage{array}
        - \usepackage{multirow}
        - \usepackage{wrapfig}
        - \usepackage{float}
        - \usepackage{colortbl}
        - \usepackage{pdflscape}
        - \usepackage{tabu}
        - \usepackage{threeparttable}
        - \usepackage{threeparttablex}
        - \usepackage[normalem]{ulem}
        - \usepackage{makecell}
        - \usepackage{xcolor}
        - \usepackage{graphicx}
        - \usepackage{setspace}\doublespacing
        - \usepackage{placeins}
        - \usepackage{amssymb}
        - \usepackage{pifont}
        - \providecommand*{\checkmark}{\ding{51}}
bibliography: [library.bib, coding_both.bib]
csl: chicago-modified.csl
documentclass: article
spacing: double
---

\newpage

```{r setup, include=FALSE}
# Global chunk defaults: hide code, warnings and messages in the rendered
# document and place figures where they are declared.
knitr::opts_chunk$set(
  echo = FALSE,
  warning = FALSE,
  message = FALSE,
  fig.pos = "H",
  out.extra = ""
)

# Packages attached for the whole document, in attach order (packages attached
# later mask same-named functions of earlier ones).
# NOTE(review): "logr" is listed because log_open() / log_close() are called
# in this document and none of the other packages is known to provide them;
# confirm that logr is the intended logging package.
p_needed <- unique(c(
  "logr",
  "lubridate", "stringi", "dplyr",
  "stringr", "openxlsx", "reshape",
  "purrr", "writexl", "ggplot2",
  "textstem", "ggrepel", "scales",
  "kableExtra", "haven", "hrbrthemes",
  "DT", "topicmodels", "quanteda",
  "tidyr", "ggpubr", "viridis",
  "readxl", "quanteda.textmodels", "cowplot"
))

# Stop with one readable message listing every missing package, instead of
# continuing and failing later with "could not find function ...".
p_installed <- vapply(p_needed, requireNamespace, logical(1), quietly = TRUE)
if (!all(p_installed)) {
  stop(
    "Please install the following packages before knitting: ",
    paste(p_needed[!p_installed], collapse = ", "),
    call. = FALSE
  )
}
invisible(lapply(p_needed, library, character.only = TRUE))

log_open("buzzword_replication.log")

```


```{r load-data, include=FALSE}
# cleaned data on article level
load("data_review_cleaned_v1.Rdata")

# data aggregated to disciplines
load("data_review_disciplines_v1.Rdata")

# wide data for disciplines (used for the line graph)
load("data_review_disciplines_wide.Rdata")

# Flag every article whose WoS category string contains "Political Science"
# and keep the flagged articles as the political-science data frame.
wos.clean.filtered[["polisci"]] <- str_detect(
  wos.clean.filtered$woscat,
  "Political Science"
)
wos.polisci <- filter(wos.clean.filtered, polisci)

log_close()
```



# Table for disciplines

```{r table-disciplines}

# Split the long discipline table into two halves that are printed side by
# side. The split point is rounded up so that the first half takes the extra
# row when the number of rows is odd; indexing with a plain `nrow / 2` yields
# fractional indices in that case and loses the last row.
n_rows <- nrow(wos.dis.agg)
n_first <- ceiling(n_rows / 2)
t1 <- wos.dis.agg[seq_len(n_first), ]
t2 <- wos.dis.agg[n_first + seq_len(n_rows - n_first), ]

list(t1, t2) %>%
  kable(
    "latex",
    booktabs = TRUE,
    row.names = FALSE,
    caption = "Total number of published journal articles by disciplines \\label{tab:disciplines}"
  ) %>%
  kable_styling(latex_options = "hold_position")

```


\newpage

# Line graph for disciplines over time

```{r yearly, fig.align='center', fig.show="hold", fig.cap="\\label{fig:yearly}Yearly number of published journal articles on populism across disciplines", fig.pos="!htp"}


# Yearly time trend per discipline #####
# WoS categories are not mutually exclusive, so a paper can be counted in
# several of the named disciplines. The extra "other" indicator is 1 only for
# papers that belong to none of the seven named disciplines, so each of those
# papers is counted exactly once.

wos.dis <- wos.dis %>%
  dplyr::select(
    `Political Science`,
    `International Relations`,
    Sociology,
    Communication,
    `History`,
    `Area Studies`,
    Economics,
    year
  )

wos.dis$other <- 0
wos.dis$other[wos.dis$`Political Science` == 0 &
                wos.dis$`International Relations` == 0 &
                wos.dis$Sociology == 0 &
                wos.dis$Communication == 0 &
                wos.dis$History == 0 &
                wos.dis$`Area Studies` == 0 &
                wos.dis$Economics == 0] <- 1

# yearly totals per column; the function is passed directly because the
# funs() wrapper is deprecated in dplyr
wos.dis.agg2 <- wos.dis %>%
  group_by(year) %>%
  summarise_all(sum, na.rm = TRUE)

# keep publication years up to and including 2018 (drops 2019 and later)
wos.dis.agg2 <- wos.dis.agg2 %>% filter(year < 2019)

# reshape to long format (one row per year and discipline); the grouped
# result is converted to a plain data.frame before melting
wos.dis.agg2 <- as.data.frame(wos.dis.agg2)
wos.dis.agg3 <- melt(wos.dis.agg2, id.vars = "year")

# Line plot: eight series (seven disciplines plus "other") are told apart by
# combining two colours with four line types.
ggplot(wos.dis.agg3, aes(year, value)) +
  geom_line(aes(linetype = variable, color = variable), size = 1) +
  theme_minimal() +
  ylab("Papers published per year") +
  xlab("Year") +
  scale_color_manual(
    name = "Discipline",
    values = c("black", "black", "black", "black", "grey", "grey", "grey", "grey")
  ) +
  scale_linetype_manual(
    name = "Discipline",
    values = c("solid", "dashed", "dotted", "dotdash", "solid", "dashed", "dotted", "dotdash")
  ) +
  theme(legend.key.width = unit(1.0, "cm"))

```


\newpage

# Line graphs for geographical distribution

```{r, newsmap, echo=F, fig.align='center', fig.show="hold", fig.cap="\\label{fig:region}Yearly number of published journal articles on populism by regional focus", fig.pos="!htp"}

# Prepare the political-science abstracts for the dictionary lookup:
# tokenise, lower-case and drop English stopwords.
corpus_1 <- corpus(wos.polisci, text_field = "abstract", docid_field = "id")
token <- quanteda::tokens(
  corpus_1,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE
)
# Stopwords are removed with dfm_remove() instead of dfm()'s `remove`
# argument: that argument (like `remove_punct` on dfm()) is deprecated in
# newer quanteda releases, and punctuation is already removed at the
# tokenisation step above.
dfm <- quanteda::dfm(token, tolower = TRUE) %>%
  dfm_remove(stopwords("english"))


##### use the dictionary
# geographical dictionary - pre-defined, read from a local YAML file
# NOTE(review): the file is called "newmap.yml" here; confirm this is the
# actual file name and not a typo for "newsmap.yml".
newsmap_dict <- dictionary(file = "newmap.yml",
                           format = "YAML")

#############################################################################
# apply the first level of the dictionary (continent)
region_dfm <- dfm_lookup(dfm, newsmap_dict, levels = 1, valuetype = "glob")

# bind results to the article data via the document id
region.results <- convert(region_dfm, to = "data.frame")
region.results$id <- region.results$doc_id
region <- full_join(wos.polisci, region.results, by = "id")

###########################################################################
# apply the first two levels of the dictionary (sub-continental regions)
subregion_dfm <- dfm_lookup(dfm, newsmap_dict, levels = 1:2, valuetype = "glob")

# bind results to the article data via the document id
subregion.results <- convert(subregion_dfm, to = "data.frame")
subregion.results$id <- subregion.results$doc_id
region.full <- full_join(region, subregion.results, by = "id")


# 0/1 indicators per article: an article is flagged for a region when at
# least one dictionary term of that region occurs in its abstract.
region.full$LA <- if_else((region.full$AMERICA.CENTER > 0 |
                             region.full$AMERICA.SOUTH > 0 |
                             region.full$AMERICA.CARIB > 0), 1, 0)

region.full$EURO <- if_else((region.full$EUROPE > 0), 1, 0)

region.full$EASTEURO <- if_else((region.full$EUROPE.EAST > 0), 1, 0)

region.full$NORTHAMERICA <- if_else((region.full$AMERICA.NORTH > 0), 1, 0)

region.full$OTHER <- if_else((region.full$ASIA > 0 |
                                region.full$OCEANIA > 0 |
                                region.full$AFRICA > 0), 1, 0)

# no continent-level match at all
region.full$NOINFO <- if_else((region.full$ASIA == 0 &
                                 region.full$OCEANIA == 0 &
                                 region.full$AFRICA == 0 &
                                 region.full$EUROPE == 0 &
                                 region.full$AMERICA == 0), 1, 0)


# Yearly number of flagged articles per region. Missing indicators (e.g. rows
# without a dictionary result after the full joins) are ignored so that a
# single NA does not turn a whole year into NA.
region.agg <- region.full %>%
  dplyr::select(LA, EURO, NORTHAMERICA, EASTEURO, OTHER, NOINFO, year) %>%
  group_by(year) %>%
  summarise(LA = sum(LA, na.rm = TRUE),
            EURO = sum(EURO, na.rm = TRUE),
            OTHER = sum(OTHER, na.rm = TRUE),
            NOINFO = sum(NOINFO, na.rm = TRUE),
            NORTHAMERICA = sum(NORTHAMERICA, na.rm = TRUE),
            EASTEURO = sum(EASTEURO, na.rm = TRUE))

# Reshape to long format. The column range LA:NORTHAMERICA follows the column
# order of the summary above, so EASTEURO is not gathered and is not plotted.
reg.agg <- region.agg %>% gather(Region, value, LA:NORTHAMERICA)


# Legend labels, given in the alphabetical order of the region codes
# (EURO, LA, NOINFO, NORTHAMERICA, OTHER).
region_labels <- c("Europe", "Latin America", "No information",
                   "North America", "Africa, Asia, Oceania")

# plot
ggplot(reg.agg, aes(year, value)) +
  geom_line(aes(linetype = Region, color = Region), size = 1) +
  theme_minimal() +
  ylab("Papers published per year") +
  xlab("Year") +
  scale_color_manual(name = "Regional Focus",
                     values = c("black", "black", "black", "grey", "grey"),
                     labels = region_labels) +
  scale_linetype_manual(name = "Regional Focus",
                        values = c("solid", "dashed", "dotted", "solid", "dashed"),
                        labels = region_labels) +
  theme(legend.key.size = unit(2, "line"))


```


\newpage

# Wordfish 

```{r}

# Document-feature matrix for scaling: keep features with at least two
# characters that occur in at least 5 documents (removes rare terms) and in
# at most 25% of all documents (removes very common terms).
dfm2 <- token %>%
  quanteda::dfm() %>%
  dfm_select(min_nchar = 2L) %>%
  dfm_trim(min_docfreq = 5) %>%
  dfm_trim(max_docfreq = 0.25,
           docfreq_type = "prop")

wf <- textmodel_wordfish(dfm2, dir = c(2, 1), sparse = FALSE)

# collect feature-level and document-level estimates
wf_terms <- tibble(feature = wf$features, beta = wf$beta,
                   psi = wf$psi)
wf_groups <- tibble(docs = wf$docs, theta = wf$theta,
                    theta_se = wf$se.theta)


# terms to label in the plots, one vector per divide

show_methods <- c("controlling", "panel", "attitudinal", "multilevel", "estimate", "sample", "individual-level", "hypotheses", "confirm", "mediated", "survey", "longitudinal", "likelihood", "predictors", "effects", "hegemonic", "writings", "essay", "reading", "paradigm", "paradigm", "sociology", "conception", "critique", "reflecting", "understandings", "describes", "constructed", "trajectory", "intellectual", "laclau’s", "laclau", "accounting", "determinants", "propensity", "hypotheses")

show_theory <- c("")

show_ideology <- c("anti-immigrant", "socialism", "classes", "marxist", "republicanism", "agrarian", "rightist", "prr", "immigrants", "revolutionary", "coup", "communities", "resistance",
"autonomy", "marginalized", "islam", "nativism")

show_geo <- c("south", "african", "africa", "venezuela’s", "thaksin", "thai", "venezuelan", "chinese", "morales", "caribbean", "kirchner", "asia", "russia", "pvv", "svp", "swiss", "belgium",
"sweden", "danish", "finns", "wilders", "brazil", "chavez", "pim", "fortuyn", "movimento", "finland", "switzerland", "wilders")

# Scatter all features (beta on x, psi on y) as grey points and label the
# features listed in `highlight`.
#   terms     data frame with columns `feature`, `beta` and `psi`
#   highlight character vector of features to label
#   title     plot title
# Returns the ggplot object.
plot_wf_terms <- function(terms, highlight, title) {
  labelled <- terms[terms$feature %in% highlight, ]
  ggplot(terms, aes(x = beta, y = psi)) +
    geom_point(color = "#C0C0C0") +
    geom_text_repel(data = labelled,
                    aes(label = feature),
                    force = 1, direction = "both",
                    size = 3,
                    arrow = arrow(length = unit(0.01, "npc")),
                    segment.colour = "grey10") +
    theme_minimal() +
    ylab("Term Fixed Effect") +
    xlab("Term Weight") +
    ggtitle(title)
}

plot_methods <- plot_wf_terms(
  wf_terms, show_methods,
  "highlighted words: methodological differences"
)

plot_ideology <- plot_wf_terms(
  wf_terms, show_ideology,
  "highlighted words: host ideologies"
)

plot_geo <- plot_wf_terms(
  wf_terms, show_geo,
  "highlighted words: geographical differences"
)

```



```{r wordfish1, fig.align='center', fig.show="hold", fig.cap="\\label{figpos}Wordfish features plotted by weight and frequency",fig.width=7,fig.height=10,fig.pos="!htp"}

# Arrange the three highlighted-term plots in a single column of three rows.
cowplot::plot_grid(
  plotlist = list(plot_geo, plot_methods, plot_ideology),
  nrow = 3,
  labels = "auto"
)

```



\pagebreak


# Online Appendix


## Publications over time

```{r overtime, fig.align='center', fig.show="hold", fig.cap="\\label{fig:overtime}Number of articles published per year from WoS", fig.pos="!htp"}

# Read the yearly publication table and make sure `year` is numeric before
# plotting it on a continuous axis.
overtime <- read_excel("table_years.xlsx")
overtime[["year"]] <- as.numeric(overtime[["year"]])

# line plot of publications per year
ggplot(data = overtime, mapping = aes(x = year, y = publications)) +
  geom_line() +
  theme_minimal()
```



\newpage

## Publications by journal


```{r journaltab}
# Number of publications by journal (political science articles only)

# helper column: every article counts once
wos.polisci$number <- 1

# articles per journal
tablejj <- wos.polisci %>%
  dplyr::select(journal, number) %>%
  dplyr::group_by(journal) %>%
  dplyr::summarize(Frequency = sum(number)) %>%
  dplyr::rename(Journal = journal)

# journals with more than 10 articles, most frequent first
tablej1 <- tablejj %>%
  filter(Frequency > 10) %>%
  arrange(desc(Frequency))

# all remaining journals collapsed into a single "other" row
tablej2 <- tablejj %>%
  filter(Frequency <= 10) %>%
  dplyr::summarize(Frequency = sum(Frequency))

tablej2$Journal <- "other"
tablej <- rbind(tablej1, tablej2)

# convert the journal names to title case
tablej$Journal <- stri_trans_totitle(tablej$Journal)

# The LaTeX option is spelled out in full ("hold_position") rather than
# relying on partial matching of "hold".
tablej %>%
  kable(
    "latex",
    booktabs = TRUE,
    caption = "\\label{journalnumber}Numbers of articles on populism per journal, more than 10 articles from 2004 to 2018"
  ) %>%
  kable_styling(latex_options = "hold_position")



```

\pagebreak


# Features of Wordfish model


```{r, eval = TRUE}
# Show the features that load most heavily on both ends of the scale:
# the 40 smallest and the 40 largest beta values.

head_terms <- wf_terms %>%
  arrange(beta) %>%
  head(40)
tail_terms <- wf_terms %>%
  arrange(desc(beta)) %>%
  head(40)

# side by side: negative end in the left columns, positive end in the right
top_terms <- cbind(head_terms, tail_terms)

# each group header spans the three columns of one side
top_terms %>%
  kable("latex", booktabs = TRUE, longtable = TRUE,
        caption = "\\label{features}Selection of top negative and positive features") %>%
  add_header_above(c("Negative features" = 3,
                     "Positive features" = 3)) %>%
  kable_styling(latex_options = c("repeat_header"))

```


\pagebreak


# Results for two-dimensional scaling

```{r, eval = T}

# Two-dimensional correspondence analysis on the trimmed dfm, followed by
# extraction of the feature and document coordinates on both dimensions.
pop_ca <- textmodel_ca(dfm2, nd = 2, sparse = TRUE)
ca <- coef(pop_ca, doc_dim = c(1, 2), feat_dim = c(1, 2))

# Feature coordinates; dimension 1 is sign-flipped so that its order is
# similar to that of the wordfish model.
ca_terms <- as_tibble(ca$coef_feature)
ca_terms <- mutate(
  ca_terms,
  term = rownames(ca$coef_feature),
  Dim1 = -Dim1
)

# Document coordinates with the same sign flip, plus the document variables
# stored in the dfm.
ca_docs <- as_tibble(ca$coef_document)
ca_docs <- mutate(
  ca_docs,
  doc = rownames(ca$coef_document),
  Dim1 = -Dim1
)
ca_docs <- bind_cols(ca_docs, docvars(dfm2))

ca_docs$title <- wos.polisci$title
```

```{r, eval = T, fig.cap="\\label{twodim}Results for two-dimensional scaling/correspondence analysis (CA)", fig.pos="!htp"}

# Scatter plot of the feature coordinates on the two CA dimensions.
ggplot(ca_terms, aes(x = Dim1, y = Dim2)) +
  geom_point(colour = "#C0C0C0", show.legend = FALSE) +
  scale_x_continuous(breaks = pretty_breaks(n = 6)) +
  scale_y_continuous(breaks = pretty_breaks(n = 5)) +
  xlab("CA Dimension 1") +
  ylab("CA Dimension 2") +
  theme_minimal()
 

```


\newpage

\FloatBarrier

## Impact factors of hand-coded sample

```{r impactfactors, eval = T, fig.cap="\\label{sample_factor}Density of sample and population of political science papers", fig.pos="!htp"}

# Compare the impact-factor distribution of the hand-coded sample with that
# of the full set of political science papers.

# population of papers with impact factors
merged_with_impact <- read_excel("merged_with_impact.xlsx")
merged_with_impact$impact <- as.numeric(merged_with_impact$impact)

# coded sample, to be merged with the impact factors
coding <- read_excel("coded_sample.xlsx")

# impact factors per journal
impact <- read_excel("incites_all_clean.xlsx")

# harmonise the journal names in both tables: lower case, and no embedded
# Windows line breaks in the coded sample
impact$journal <- tolower(impact$journal)
coding$journal <- tolower(coding$journal)
coding$journal <- str_remove_all(coding$journal, "\r\n")

# merge
# NOTE(review): no `by` is given, so the join uses every column name the two
# tables share; confirm that `journal` is the only shared column.
impact_sample <- as.data.frame(left_join(coding, impact))
impact_sample$impact <- as.numeric(impact_sample$impact)

# select relevant variables
impact_sample <- impact_sample %>%
  dplyr::select(title, impact)

merged_with_impact <- merged_with_impact %>%
  dplyr::select(title, impact)

# stack sample and population, marked by `thing`
impact_sample$thing <- "Random Sample"
merged_with_impact$thing <- "Population"
impact_data <- rbind(impact_sample, merged_with_impact)

# Density plot. theme_minimal() is a complete theme and replaces earlier
# theme() settings, so the legend-title tweak has to come after it.
ggplot(data = impact_data, aes(x = impact)) +
  geom_density(aes(fill = thing,
                   alpha = thing)) +
  scale_fill_manual(values = c("light blue", "purple")) +
  scale_alpha_manual(values = c(1, 0.25)) +
  xlab("Journal Impact Factor") +
  ylab("Density") +
  theme_minimal() +
  theme(legend.title = element_blank())

```



```{r}
# Save the command history of an interactive session. `file` has to be a
# path: the bare name `file` refers to base R's file() function and is not a
# valid argument. The call is skipped when the document is knitted
# non-interactively, where no command history is available.
if (interactive()) {
  savehistory(file = ".Rhistory")
}
```



